In [1]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn import preprocessing
In [2]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots

# Make "notebook" plotly's default renderer, so figures displayed later in this
# notebook (e.g. via `fig.show()`) use it without passing a renderer explicitly.
pio.renderers.default = "notebook"
In [3]:
# Load the corpus with a declared dtype for every column, so pandas does not
# have to infer the types while parsing; the aim is to keep memory use down.
# Columns are listed per target type and merged into a single dtype mapping.
text_columns = [
    "flow_id",
    "src_ip",
    "dst_ip",
    "ts",
    "simillar_http",
    "label",
]

integer_columns = [
    "index",
    "src_port",
    "dst_port",
    "protocol",
    "flow_duration",
    "total_fwd_packets",
    "total_backward_packets",
    "total_length_fwd_packets",
    "total_length_bwd_packets",
    "fwd_packet_length_max",
    "fwd_packet_length_min",
    "bwd_packet_length_max",
    "bwd_packet_length_min",
    "flow_iat_max",
    "flow_iat_min",
    "flow_iat_total",
    "fwd_iat_max",
    "fwd_iat_min",
    "bwd_iat_total",
    "fwd_psh_flags",
    "bwd_psh_flags",
    "fwd_urg_flags",
    "bwd_urg_flags",
    "fwd_header_length",
    "bwd_header_length",
    "min_packet_length",
    "max_packet_length",
    "fin_flag_count",
    "syn_flag_count",
    "rst_flag_count",
    "psh_flag_count",
    "ack_flag_count",
    "urg_flag_count",
    "cwe_flag_count",
    "ece_flag_count",
    "down_up_ratio",
    "active_max",
    "active_min",
    "idle_max",
    "idle_min",
    "inbound",
]

float_columns = [
    "fwd_packet_length_mean",
    "fwd_packet_length_std",
    "bwd_packet_length_mean",
    "bwd_packet_length_std",
    "flow_Bps",
    "flow_pps",
    "flow_iat_mean",
    "flow_iat_std",
    "fwd_iat_mean",
    "fwd_iat_std",
    "bwd_iat_mean",
    "bwd_iat_std",
    "bwd_iat_max",
    "bwd_iat_min",
    "fwd_pps",
    "bwd_pps",
    "packet_length_mean",
    "packet_length_std",
    "packet_length_variance",
    "avg_packet_size",
    "avg_fwd_segment_size",
    "avg_bwd_segment_size",
    "fwd_header_length_1",
    "fwd_avg_bytes_bulk",
    "fwd_avg_packets_bulk",
    "fwd_avg_bulk_rate",
    "bwd_avg_bytes_bulk",
    "bwd_avg_packets_bulk",
    "bwd_avg_bulk_rate",
    "subflow_fwd_packets",
    "subflow_fwd_bytes",
    "subflow_bwd_packets",
    "subflow_bwd_bytes",
    "init_win_bytes_forward",
    "init_win_bytes_backward",
    "act_data_pkt_fwd",
    "min_seg_size_forward",
    "active_mean",
    "active_std",
    "idle_mean",
    "idle_std",
]

# read_csv looks dtypes up by column name, so the key order is irrelevant.
column_dtypes = {
    **dict.fromkeys(text_columns, str),
    **dict.fromkeys(integer_columns, int),
    **dict.fromkeys(float_columns, float),
}

df = pd.read_csv("data/scrubbed_syn_udp_ben.csv", dtype=column_dtypes)
In [4]:
# Distribution of rows by label (the target variable y).
# `size()` counts the rows in each group directly. The previous `count()`
# tallied non-null cells for every column and then kept only the tally of the
# unrelated "index" column, which is wasteful and would undercount if that
# column ever held missing values.
# Result: one row per label, with the columns "label" and "count".
df_group_count = df.groupby("label").size().reset_index(name="count")
In [5]:
# The per-label counts add up to the number of rows that were grouped.
total_rows = df_group_count["count"].sum()
print(f"Total number of rows: {total_rows}")
Total number of rows: 413828
In [6]:
# Bar chart of the per-label row counts, one colour per label, with the title
# centred horizontally.
fig = px.bar(
    data_frame=df_group_count,
    x="label",
    y="count",
    color="label",
    title='Row distribution by label',
    template="plotly_white",
).update_layout(title_x=0.5)
fig.show()
In [7]:
def get_color_list(attributes):
    """Return one plotly default colour per element of ``attributes``.

    Distinct values are assigned colours from
    ``px.colors.DEFAULT_PLOTLY_COLORS`` in order of first appearance. The
    palette is cycled when there are more distinct values than colours, and
    repeated values reuse the colour of their first occurrence, so the result
    always has the same length as ``attributes`` and can be passed directly as
    a per-bar / per-marker colour list.

    Parameters
    ----------
    attributes : iterable of hashable
        Category values (e.g. the label column of an aggregated frame).

    Returns
    -------
    list of str
        Colour strings, aligned element-by-element with ``attributes``.
    """
    palette = px.colors.DEFAULT_PLOTLY_COLORS
    values = list(attributes)

    color_by_value = {}
    for value in values:
        if value not in color_by_value:
            # Wrap around instead of silently running out of colours.
            color_by_value[value] = palette[len(color_by_value) % len(palette)]

    return [color_by_value[value] for value in values]
In [8]:
# EDA: 95th percentile of every numeric feature, per label.
# NOTE: despite its name, `df_label_means` holds 95th percentiles, not means;
# the name is kept because later cells read it.
# Only numeric columns are aggregated: a quantile is undefined for the string
# columns (flow_id, src_ip, dst_ip, ts, ...), and current pandas releases raise
# a TypeError for them instead of silently dropping them.
df_label_means = (
    df.select_dtypes(include="number")
    .groupby(df["label"])
    .quantile(0.95)
    .reset_index()
)

# One panel with two y-axes: bars on the primary axis, a line on the secondary.
fig = make_subplots(rows=1, cols=1, specs=[[{"secondary_y": True}]],)
fig.layout.template = "plotly_white"
fig.add_trace(
    go.Bar(
        x=df_label_means["label"],
        y=df_label_means["flow_duration"],
        name="Flow Duration 95th percentile",
        marker=dict(color=get_color_list(df_label_means["label"]),),
    ),
)

fig.add_trace(
    go.Scatter(
        x=df_label_means["label"],
        y=df_label_means["total_length_fwd_packets"],
        mode="lines",
        name="Total forwarded packets length 95th percentile",
        line=dict(color="black"),
        opacity=0.5,
    ),
    secondary_y=True,
)

# Primary axis (flow duration bars) uses a log scale.
fig.update_yaxes(
    title_text="Flow duration (log scale)", type="log", secondary_y=False
)
# Secondary axis (total forwarded packets length line) stays linear.
fig.update_yaxes(
    title_text="Total length of forwarded packets",
    type="linear",
    secondary_y=True,
)
fig.update_xaxes(title_text="label")

fig.update_layout(
    title_x=0.5,
    title_text="95th percentile of flow duration and total forwarded packets length grouped by target variable",
)

fig.show()
In [9]:
# 95th percentile of forward (sent) vs backward (received) packets per second,
# per label. Reads `df_label_means`, which holds per-label 95th percentiles.
fig = make_subplots(rows=1, cols=1, specs=[[{"secondary_y": True}]],)
fig.layout.template = "plotly_white"
fig.add_trace(
    go.Bar(
        x=df_label_means["label"],
        y=df_label_means["fwd_pps"],
        name="95th percentile sent pps",
        marker=dict(color=get_color_list(df_label_means["label"]),),
    ),
)

fig.add_trace(
    go.Scatter(
        x=df_label_means["label"],
        y=df_label_means["bwd_pps"],
        mode="lines",
        name="95th percentile received pps",
        line=dict(color="black"),
        opacity=0.5,
    ),
    secondary_y=True,
)

# Primary axis (sent pps bars) uses a log scale.
fig.update_yaxes(
    title_text="Sent packets per second (log scale)",
    type="log",
    secondary_y=False,
)
# Secondary axis (received pps line) stays linear.
fig.update_yaxes(
    title_text="Received packets per second", type="linear", secondary_y=True
)
fig.update_xaxes(title_text="label")

fig.update_layout(
    title_x=0.5,
    # The previous title repeated "95th percentile" at both ends.
    title_text="95th percentile of forwarded vs received packets per second grouped by target variable",
)

fig.show()
In [10]:
# Deciles (0.0, 0.1, ..., 1.0) of every numeric feature, per label.
# The grid is built from integers so that it always has exactly 11 points and
# clean values; a float-step `np.arange(0, 1.1, 0.1)` depends on rounding for
# both its length and its entries (e.g. 0.30000000000000004).
quantile_levels = np.arange(11) / 10

# Only numeric columns are aggregated: quantiles are undefined for the string
# columns, and current pandas releases raise a TypeError for them.
df_labels_quantiles = (
    df.select_dtypes(include="number")
    .groupby(df["label"])
    .quantile(quantile_levels)
    .reset_index()
    # The quantile level is the unnamed second index level -> "level_1".
    .rename(columns={"level_1": "quantile"})
)
In [11]:
# Preview the first five rows of the per-label quantile table.
df_labels_quantiles.head(n=5)
Out[11]:
label quantile index src_port dst_port protocol flow_duration total_fwd_packets total_backward_packets total_length_fwd_packets ... min_seg_size_forward active_mean active_std active_max active_min idle_mean idle_std idle_max idle_min inbound
0 BENIGN 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... -1.408238e+09 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 BENIGN 0.1 296.0 443.0 53.0 6.0 2.0 1.0 0.0 0.0 ... 2.000000e+01 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 BENIGN 0.2 1240.4 443.0 53.0 6.0 127.0 2.0 0.0 0.0 ... 2.000000e+01 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 BENIGN 0.3 2888.0 50539.0 53.0 6.0 804.1 2.0 2.0 6.0 ... 2.000000e+01 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 BENIGN 0.4 6007.8 52709.0 80.0 6.0 20701.0 2.0 2.0 31.0 ... 2.000000e+01 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 84 columns

In [12]:
# One line per label through the per-quantile values of flow_duration (x) and
# total_fwd_packets (y); labels are told apart by both colour and dash style.
quantile_line_options = dict(
    x="flow_duration",
    y="total_fwd_packets",
    color="label",
    line_dash="label",
    template="plotly_white",
    title="Number of sent packets based off of flow duration grouped by target variable",
)
px.line(df_labels_quantiles, **quantile_line_options)
In [13]:
# PCA: project the numeric flow features onto two principal components.

# Features fed into the projection (identifiers, ports, timestamps and the
# label itself are left out).
pca_features = [
    "protocol",
    "flow_duration",
    "total_fwd_packets",
    "total_backward_packets",
    "total_length_fwd_packets",
    "total_length_bwd_packets",
    "fwd_packet_length_max",
    "fwd_packet_length_min",
    "fwd_packet_length_mean",
    "fwd_packet_length_std",
    "bwd_packet_length_max",
    "bwd_packet_length_min",
    "bwd_packet_length_mean",
    "bwd_packet_length_std",
    "flow_Bps",
    "flow_pps",
    "flow_iat_mean",
    "flow_iat_std",
    "flow_iat_max",
    "flow_iat_min",
    "flow_iat_total",
    "fwd_iat_mean",
    "fwd_iat_std",
    "fwd_iat_max",
    "fwd_iat_min",
    "bwd_iat_total",
    "bwd_iat_mean",
    "bwd_iat_std",
    "bwd_iat_max",
    "bwd_iat_min",
    "fwd_psh_flags",
    "bwd_psh_flags",
    "fwd_urg_flags",
    "bwd_urg_flags",
    "fwd_header_length",
    "bwd_header_length",
    "fwd_pps",
    "bwd_pps",
    "min_packet_length",
    "max_packet_length",
    "packet_length_mean",
    "packet_length_std",
    "packet_length_variance",
    "fin_flag_count",
    "syn_flag_count",
    "rst_flag_count",
    "psh_flag_count",
    "ack_flag_count",
    "urg_flag_count",
    "cwe_flag_count",
    "ece_flag_count",
    "down_up_ratio",
    "avg_packet_size",
    "avg_fwd_segment_size",
    "avg_bwd_segment_size",
    "fwd_header_length_1",
    "fwd_avg_bytes_bulk",
    "fwd_avg_packets_bulk",
    "fwd_avg_bulk_rate",
    "bwd_avg_bytes_bulk",
    "bwd_avg_packets_bulk",
    "bwd_avg_bulk_rate",
    "subflow_fwd_packets",
    "subflow_fwd_bytes",
    "subflow_bwd_packets",
    "subflow_bwd_bytes",
    "init_win_bytes_forward",
    "init_win_bytes_backward",
    "act_data_pkt_fwd",
    "min_seg_size_forward",
    "active_mean",
    "active_std",
    "active_max",
    "active_min",
    "idle_mean",
    "idle_std",
    "idle_max",
    "idle_min",
    "inbound",
]

# Step 1: standardise each feature column.
scale = preprocessing.StandardScaler()
X = scale.fit_transform(df[pca_features])

# Step 2: rescale each sample (row); `normalize` defaults to unit L2 norm.
X_norm = preprocessing.normalize(X)

# Step 3: fit the 2-component PCA. `random_state` is pinned so that a
# randomized SVD solver, if scikit-learn selects one for this data size,
# yields the same projection on every run.
pca = PCA(n_components=2, random_state=42)
principalComponents = pca.fit_transform(X_norm)
In [14]:
# Wrap the two PCA components in a frame with named columns and attach each
# row's label (matched on the row index) for colouring the scatter plot.
df_principalComponents = pd.DataFrame(
    principalComponents, columns=["comp_1", "comp_2"]
).assign(label=df["label"])
In [15]:
# Scatter of the two principal components, coloured by label; the markers are
# semi-transparent so that overlapping points remain visible.
pca_scatter_options = dict(
    x="comp_1",
    y="comp_2",
    color="label",
    opacity=0.3,
    template="plotly_white",
    title="Principal Component Analysis based off of the relevant attributes",
)
px.scatter(df_principalComponents, **pca_scatter_options)
In [16]:
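# NOTE: disabled t-SNE experiment, kept for reference only. Before re-enabling:
# `TSNE` is not imported in the cells above (it lives in `sklearn.manifold`),
# and `tsne_scatter` is not defined in the cells above, so the last line needs
# a plot object to be created first.
# tsne_ = TSNE(random_state = 42, n_components=2,verbose=2, perplexity=1, n_iter=250).fit_transform(principalComponents)
# df_tsne = pd.DataFrame(tsne_)
# df_tsne['label'] = df['label']
# tsne_scatter.opts(color="label", width=850, alpha=0.5, cmap="Set1")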